library(ISLR2)
library(car)
library(np)
library(splines)
library(fda)
During the last lab we have familiarized with the two simplest ways to move beyond linearity in (univariate) regression, namely with polynomial and step functions. In this lab, we will look at two more methods that provide non-linear fitting: local regression and splines. While the former relies on local fitting (as the name suggests), splines will allow to merge together polynomial and step functions providing a powerful linear smoother.
Local regression involves computing the fit at a target point \(x_0\) using only the nearby training observations. Local regression is sometimes referred to as a memory-based procedure because, like nearest-neighbors, we need all the training data each time we wish to compute a prediction. Let us keep working with the Prestige dataset. We start with weighted local averaging with uniform (or rectangular) kernel.
m_loc = npreg(prestige ~ income,
ckertype = 'uniform',
bws = 3200, # bandwidth
data = Prestige)
income_newdata=data.frame(income=with(Prestige, seq(range(income)[1],range(income)[2],by=0.5)))
preds=predict(m_loc,newdata=income_newdata,se=T)
se.bands=cbind(preds$fit +2* preds$se.fit ,preds$fit -2* preds$se.fit)
with(
Prestige,
plot(
income ,
prestige ,
xlim = range(income_newdata$income) ,
cex = .5,
col = " darkgrey ",
main = 'Local Averaging - bws3200 - Uniform kernel'
)
)
lines(income_newdata$income,preds$fit ,lwd =2, col =" blue")
matlines(income_newdata$income,se.bands ,lwd =1, col =" blue",lty =3)
This can be manually implemented in a straightforward way. Spoiler alert: the code below is very bad in terms of efficiency and portability, it is only shown for didactic purposes!
loc_pred_unif_manual <- Vectorize(function(x_0, bw) {
ind_unit_in_bw <-
which(Prestige$income > (x_0 - bw) & Prestige$income < (x_0 + bw))
Prestige %>%
dplyr::slice(ind_unit_in_bw) %>%
dplyr::summarise(mean(prestige)) %>%
dplyr::pull()
}, vectorize.args = "x_0")
# Predict original data
preds <- predict(m_loc, newdata=data.frame(income=Prestige$income))
all(dplyr::near(preds, loc_pred_unif_manual(x_0 = Prestige$income,bw = 3200), tol = 1e-13))
## [1] TRUE
Let us try to decrease the bandwidth
m_loc = npreg(prestige ~ income,
ckertype = 'uniform',
bws = 1000, # bandwidth
data = Prestige)
income_newdata=data.frame(income=with(Prestige, seq(range(income)[1],range(income)[2],by=0.5)))
preds=predict(m_loc,newdata=income_newdata,se=T)
se.bands=cbind(preds$fit +2* preds$se.fit ,preds$fit -2* preds$se.fit)
with(
Prestige,
plot(
income ,
prestige ,
xlim = range(income_newdata$income) ,
cex = .5,
col = " darkgrey ",
main = 'Local Averaging - bws1000 - Uniform kernel'
)
)
lines(income_newdata$income,preds$fit ,lwd =2, col =" blue")
matlines(income_newdata$income,se.bands ,lwd =1, col =" blue",lty =3)
We have issues with uniform kernel if there are no data in the bins…
m_loc = npreg(prestige ~ income,
ckertype = 'uniform',
bws = 5000, # bandwidth
data = Prestige)
income_newdata=data.frame(income=with(Prestige, seq(range(income)[1],range(income)[2],by=0.5)))
preds=predict(m_loc,newdata=income_newdata,se=T)
se.bands=cbind(preds$fit +2* preds$se.fit ,preds$fit -2* preds$se.fit)
with(
Prestige,
plot(
income ,
prestige ,
xlim = range(income_newdata$income) ,
cex = .5,
col = " darkgrey ",
main = 'Local Averaging - bws5000 - Uniform kernel'
)
)
lines(income_newdata$income,preds$fit ,lwd =2, col =" blue")
matlines(income_newdata$income,se.bands ,lwd =1, col =" blue",lty =3)
An option could be to change kernel:
m_loc = npreg(prestige ~ income,
ckertype = 'gaussian',
bws = 3200, # bandwidth
data = Prestige)
income_newdata=data.frame(income=with(Prestige, seq(range(income)[1],range(income)[2],by=0.5)))
preds=predict(m_loc,newdata=income_newdata,se=T)
se.bands=cbind(preds$fit +2* preds$se.fit ,preds$fit -2* preds$se.fit)
with(
Prestige,
plot(
income ,
prestige ,
xlim = range(income_newdata$income) ,
cex = .5,
col = " darkgrey ",
main = 'Local Averaging - bws3200 - Gaussian kernel'
)
)
lines(income_newdata$income,preds$fit ,lwd =2, col =" blue")
matlines(income_newdata$income,se.bands ,lwd =1, col =" blue",lty =3)
Notice that the same results could be achieved by means of the locpoly function in the KernSmooth package:
m_kern_smooth <- with(Prestige,KernSmooth::locpoly(x = income, y = prestige, bandwidth = 3200, degree = 0,
range.x = c(range(income)[1], range(income)[2]), gridsize = nrow(income_newdata)))
all(dplyr::near(m_kern_smooth$y,preds$fit,tol = 1e-1)) # somewhat different approx up to .1
## [1] TRUE
Let us try to decrease the bandwidth
m_loc = npreg(prestige ~ income,
ckertype = 'gaussian',
bws = 1000, # bandwidth
data = Prestige)
income_newdata=data.frame(income=with(Prestige, seq(range(income)[1],range(income)[2],by=0.5)))
preds=predict(m_loc,newdata=income_newdata,se=T)
se.bands=cbind(preds$fit +2* preds$se.fit ,preds$fit -2* preds$se.fit)
with(
Prestige,
plot(
income ,
prestige ,
xlim = range(income_newdata$income) ,
cex = .5,
col = " darkgrey ",
main = 'Local Averaging - bws1000 - Gaussian kernel'
)
)
lines(income_newdata$income,preds$fit ,lwd =2, col =" blue")
matlines(income_newdata$income,se.bands ,lwd =1, col =" blue",lty =3)
m_loc = npreg(prestige ~ income,
ckertype = 'gaussian',
bws = 5000, # bandwidth
data = Prestige)
income_newdata=data.frame(income=with(Prestige, seq(range(income)[1],range(income)[2],by=0.5)))
preds=predict(m_loc,newdata=income_newdata,se=T)
se.bands=cbind(preds$fit +2* preds$se.fit ,preds$fit -2* preds$se.fit)
with(
Prestige,
plot(
income ,
prestige ,
xlim = range(income_newdata$income) ,
cex = .5,
col = " darkgrey ",
main = 'Local Averaging - bws5000 - Gaussian kernel'
)
)
lines(income_newdata$income,preds$fit ,lwd =2, col =" blue")
matlines(income_newdata$income,se.bands ,lwd =1, col =" blue",lty =3)
Last note: a very nice interactive visualization on how local regression works is available here (Figure 6.6).
Try to perform again the analysis employing the last continuous kernel type available in the np package, namely the epanechnikov kernel. Do the results change much with respect to the Gaussian kernel?
Splines are piecewise polynomial functions which are constrained to join smoothly at knots. Nevertheless, before looking at splines let us empirically motivate their need by considering piecewise polynomials first. Let us keep working with our Prestige dataset. There is no easy way to do it, so we hard-code it
cutoff <- 10000
Prestige$income_cut <- Prestige$income>cutoff
Prestige$income_cut_model <- (Prestige$income-cutoff)*Prestige$income_cut
model_cut=lm(prestige ~ income + income_cut_model, data=Prestige)
new_data <-
with(Prestige, data.frame(
income = seq(range(income)[1], range(income)[2], by = 0.5)
))
new_data$income_cut_model = (new_data$income - cutoff) * (new_data$income > cutoff)
preds_cut=predict(model_cut,new_data ,se=T)
se.bands_cut=cbind(preds_cut$fit +2* preds_cut$se.fit ,preds_cut$fit -2* preds$se.fit)
with(Prestige, plot(income ,prestige ,xlim=range(new_data$income) ,cex =.5, col =" darkgrey " ))
lines(new_data$income,preds_cut$fit ,lwd =2, col =" blue")
matlines(new_data$income, se.bands_cut ,lwd =1, col =" blue",lty =3)
We can allow the piecewise polynomial to be discontinuous
model_cut_disc=lm(prestige ~ income + income_cut_model + I(income>cutoff), data=Prestige)
preds=predict(model_cut_disc,new_data ,se=T)
se.bands=cbind(preds$fit +2* preds$se.fit ,preds$fit -2* preds$se.fit)
with(Prestige, plot(income ,prestige ,xlim=range(new_data$income) ,cex =.5, col =" darkgrey " ))
lines(new_data$income,preds$fit ,lwd =2, col =" blue")
matlines(new_data$income, se.bands ,lwd =1, col =" blue",lty =3)
And consider different functional forms
model_cut_quad <- lm(prestige ~ poly(income,degree = 2) + income_cut_model + I(income>cutoff), data=Prestige)
preds=predict(model_cut_quad,new_data,se=T)
se.bands=cbind(preds$fit +2* preds$se.fit ,preds$fit -2* preds$se.fit)
with(Prestige, plot(income ,prestige ,xlim=range(new_data$income) ,cex =.5, col =" darkgrey " ))
lines(new_data$income,preds$fit ,lwd =2, col =" blue")
matlines(new_data$income, se.bands ,lwd =1, col =" blue",lty =3)
How to move forward in this direction? That is, allowing for different functional forms in different bins whilst maintaining some degree of smoothness in the interpolation? Splines! For doing so, we use the built-in splines package. The idea underlying regression splines relies on specification of a set of knots, producing sequence of basis functions and then using least squares for estimating coefficients.
model_linear_spline <- lm(prestige ~ bs(income, knots=10000,degree=1), data=Prestige)
preds=predict(model_linear_spline,new_data,se=T)
se.bands=cbind(preds$fit +2* preds$se.fit ,preds$fit -2* preds$se.fit)
with(Prestige, plot(income ,prestige ,xlim=range(new_data$income) ,cex =.5, col =" darkgrey " ))
lines(new_data$income,preds$fit ,lwd =2, col =" blue")
matlines(new_data$income, se.bands ,lwd =1, col =" blue",lty =3)
Recall that we can always look at the design matrix (and how the bs() command builds the polynomial splines) with the model.matrix() function
head(unname(model.matrix(model_linear_spline)))
## [,1] [,2] [,3]
## [1,] 1 0.8519428 0.14805718
## [2,] 1 0.0000000 1.00000000
## [3,] 1 0.9223559 0.00000000
## [4,] 1 0.8791139 0.00000000
## [5,] 1 0.8299073 0.00000000
## [6,] 1 0.9351345 0.06486555
Notice that predictions made with the hard-coded model_cut model is the same as the model_linear_spline one:
all(dplyr::near(x = preds_cut$fit,preds$fit))
## [1] TRUE
Yet, the model coefficients are different:
rbind(model_cut=coef(model_cut),
model_linear_spline=coef(model_linear_spline))
## (Intercept) income income_cut_model
## model_cut 17.30643 0.004700415 -0.003565982
## model_linear_spline 20.17838 44.132196319 62.145854931
The reason is due to the different set of basis used to represent the space of spline functions: the former employs a conceptually simple (and easy to code) truncated power basis, whereas the latter uses a computationally more efficient B-spline basis (more on this later in the lab).
Flexibility? The sky is the limit here. Let us build a piecewise linear spline
inc_breaks <- c(seq(0, 10000, by = 2500)[-1], 15000)
model_linear_spline_2 <-
lm(prestige ~ bs(income, knots = inc_breaks, degree = 1), data = Prestige)
preds=predict(model_linear_spline_2, new_data,se=T)
se.bands=cbind(preds$fit +2* preds$se.fit ,preds$fit -2* preds$se.fit)
with(Prestige, plot(income ,prestige ,xlim=range(new_data$income) ,cex =.5, col =" darkgrey " ))
lines(new_data$income,preds$fit ,lwd =2, col =" blue")
matlines(new_data$income, se.bands ,lwd =1, col =" blue",lty =3)
Piecewise quadratic
model_quad_splines <-
lm(prestige ~ bs(income, knots = inc_breaks, degree = 2), data = Prestige)
preds=predict(model_quad_splines, new_data,se=T)
se.bands=cbind(preds$fit +2* preds$se.fit ,preds$fit -2* preds$se.fit)
with(Prestige, plot(income ,prestige ,xlim=range(new_data$income) ,cex =.5, col =" darkgrey " ))
lines(new_data$income,preds$fit ,lwd =2, col =" blue")
matlines(new_data$income, se.bands ,lwd =1, col =" blue",lty =3)
Cubic spline
model_cubic_splines <-
lm(prestige ~ bs(income, knots = inc_breaks, degree = 3), data = Prestige)
preds=predict(model_cubic_splines, new_data,se=T)
se.bands=cbind(preds$fit +2* preds$se.fit ,preds$fit -2* preds$se.fit)
with(Prestige, plot(income ,prestige ,xlim=range(new_data$income) ,cex =.5, col =" darkgrey " ))
lines(new_data$income,preds$fit ,lwd =2, col =" blue")
matlines(new_data$income, se.bands ,lwd =1, col =" blue",lty =3)
I can either specify the knots (as done so far), or have R automatically select the spacing, I just need to specify their number knowing that: \[\#dof = \#knots + degree\]
Example: if we want \(4\) knots and a cubic spline, we will end up with \(7\) degrees of freedom.
model_cubic_splines_2 <-
lm(prestige ~ bs(income, degree = 3,df = 7), data = Prestige)
preds=predict(model_cubic_splines_2, new_data,se=T)
se.bands=cbind(preds$fit +2* preds$se.fit ,preds$fit -2* preds$se.fit)
with(Prestige, plot(income ,prestige ,xlim=range(new_data$income) ,cex =.5, col =" darkgrey " ))
lines(new_data$income,preds$fit ,lwd =2, col =" blue")
matlines(new_data$income, se.bands ,lwd =1, col =" blue",lty =3)
knots <- attr(bs(Prestige$income, degree=3,df=7),'knots')
knots_pred=predict(model_cubic_splines_2,list(income=knots))
points(knots,knots_pred, col='blue',pch=19)
abline(v = knots, lty=3)
We can visualize how B-spline bases look like in the unit interval:
splines_grid <- seq(0, 1, by=0.001)
lin_spline <- bs(splines_grid, df=7, degree = 1)
quad_spline <- bs(splines_grid, df=8, degree = 2)
cub_spline <- bs(splines_grid, df=9, degree = 3)
knots_splines <- attributes(lin_spline)$knots
par(mfrow=c(3,1))
plot(lin_spline[,1]~splines_grid, ylim=c(0,max(lin_spline)), type='l', lwd=2, col=1,
xlab="", ylab="", main="Linear B-spline basis")
for (j in 2:ncol(lin_spline)) lines(lin_spline[,j]~splines_grid, lwd=2, col=j)
points(knots_splines,rep(0,length(knots_splines)),pch=19)
abline(v = knots_splines,lty=3)
plot(quad_spline[,1]~splines_grid, ylim=c(0,max(quad_spline)), type='l', lwd=2, col=1,
xlab="", ylab="", main="Quadratic B-spline basis")
for (j in 2:ncol(quad_spline)) lines(quad_spline[,j]~splines_grid, lwd=2, col=j)
points(knots_splines,rep(0,length(knots_splines)),pch=19)
abline(v = knots_splines,lty=3)
plot(cub_spline[,1]~splines_grid, ylim=c(0,max(cub_spline)), type='l', lwd=2, col=1,
xlab="", ylab="", main="Cubic B-spline basis")
for (j in 2:ncol(cub_spline)) lines(cub_spline[,j]~splines_grid, lwd=2, col=j)
points(knots_splines,rep(0,length(knots_splines)),pch=19)
abline(v = knots_splines,lty=3)
Notice that, by employing the user-friendly truncated power basis,
\[\begin{aligned} h_{j}(X) &=X^{j-1}, j=1, \ldots, M \\ h_{M+\ell}(X) &=\left(X-\xi_{\ell}\right)_{+}^{M-1}, \ell=1, \ldots, K \end{aligned}\]where M denotes the order of the spline, \(\xi_{\ell}\) the \(l\)-th knot and \(\left(A\right)_{+}=\max(A, 0)\), we can actually hard code the same model as model_cubic_splines_2
# Define trunc power basis
M <- 4 # cubic spline
X_powers <- poly(Prestige$income,degree = (M-1),raw = TRUE)
X_trunc_powers <- sapply(knots, function(k) (Prestige$income-k)^(M-1)*(Prestige$income>k))
X_manual_cubic_splines <- cbind(X_powers, X_trunc_powers)
# Fit the model with OLS
model_cubic_splines_2_manual <- lm(Prestige$prestige ~ X_manual_cubic_splines)
# Construct the new_data_manual object
new_data_manual_powers <- poly(new_data[,1],degree = 3,raw = TRUE)
new_data_manual_trunc_powers <- sapply(knots, function(k) (new_data[,1]-k)^3*(new_data[,1]>k))
new_data_manual <- cbind(new_data_manual_powers, new_data_manual_trunc_powers)
# Make predictions
preds_manual_cubic_splines <- c(cbind(1,new_data_manual)%*%model_cubic_splines_2_manual$coefficients)
# Compare with the previous solution
all(dplyr::near(preds$fit,preds_manual_cubic_splines))
## [1] TRUE
Apart from the considerable coding effort with respect to using the built-in bs() function, values in the coefficients vector and in the design matrix get absurdly low and high in our hard-coded solution… Let us stick with B-spline bases from now on!
How to avoid weird behavior at the boundaries of the domain? We can employ natural splines
knots <- quantile(Prestige$income,probs=c(0.1,0.5,0.9))
boundary_knots <- quantile(Prestige$income,probs=c(0.05,0.95))
model_ns=lm(prestige ~ ns(income,knots=knots,Boundary.knots=boundary_knots), data=Prestige) #defaults to three knots
preds=predict(model_ns, new_data,se=T)
se.bands=cbind(preds$fit +2* preds$se.fit ,preds$fit -2* preds$se.fit)
with(Prestige, plot(income ,prestige ,xlim=range(new_data$income) ,cex =.5, col =" darkgrey " ))
lines(new_data$income,preds$fit ,lwd =2, col =" blue")
matlines(new_data$income, se.bands ,lwd =1, col =" blue",lty =3)
knots_pred=predict(model_ns,list(income=knots))
points(knots,knots_pred, col='blue',pch=19)
boundary_pred <- predict(model_ns,list(income=boundary_knots))
points(boundary_knots,boundary_pred,col='red',pch=19)
abline(v = knots, lty=3, col="blue")
abline(v = boundary_knots, lty=3, col="red")
Clearly, the specification of knots can be non-quantile driven in general.
Play around with the splines functions, applying them to the Wage dataset.
data(Wage)
with(Wage, plot(age,wage))
With smoothing splines we look for a function \(f(\cdot)\) that makes \(RSS=\sum_{i=1}^n(y_i-f(x_i))^2\) small, but that is also smooth. That is, we aim at minimizing
\[\begin{equation} \operatorname{RSS}(f, \lambda)=\sum_{i=1}^{n}\left\{y_{i}-f\left(x_{i}\right)\right\}^{2}+\lambda \int\left\{f^{\prime \prime}(t)\right\}^{2} d t \end{equation}\]This is operationally done by performing a penalized regression over the natural spline basis, placing knots at all the inputs. This means that, remarkably, the problem defined on an infinite-dimensional function space has a finite-dimensional, unique minimizer which is a natural cubic spline! The this result is not even difficult to prove (see Theorem 2.3 of Nonparametric Regression and Generalized Linear Models: A Roughness Penalty Approach by Green and Silverman) This is automatically performed in R through the smooth.spline function.
fit_smooth_spline <- with(Prestige, smooth.spline(income,prestige,df=100))
with(Prestige, plot(income ,prestige, cex =.5, col =" darkgrey "))
lines(fit_smooth_spline,col="blue",lwd=2)
fit_smooth_spline <- with(Prestige, smooth.spline(income,prestige,df=20))
with(Prestige, plot(income ,prestige, cex =.5, col =" darkgrey "))
lines(fit_smooth_spline,col="blue",lwd=2)
Or we can directly specify the value of the smoothing parameter \(\lambda\)
fit_smooth_spline <- with(Prestige, smooth.spline(income,prestige,lambda = 1e-1))
with(Prestige, plot(income ,prestige, cex =.5, col =" darkgrey "))
lines(fit_smooth_spline,col="blue",lwd=2)
fit_smooth_spline <- with(Prestige, smooth.spline(income,prestige,lambda = 1e-6))
with(Prestige, plot(income ,prestige, cex =.5, col =" darkgrey "))
lines(fit_smooth_spline,col="blue",lwd=2)
Generally, one wants to optimize the LOOCV error
\[\begin{equation} \mathcal{V}_{o}=\frac{1}{n} \sum_{i=1}^{n}\left(\hat{f}_{i}^{[-i]}-y_{i}\right)^{2}=\frac{1}{n} \sum_{i=1}^{n}\left(y_{i}-\hat{f}_{i}\right)^{2} /\left(1-A_{i i}\right)^{2} \end{equation}\]or the GCV error
\[\begin{equation} \mathcal{V}_{g}=\frac{n \sum_{i=1}^{n}\left(y_{i}-\hat{f}_{i}\right)^{2}}{[n-\operatorname{tr}(\mathbf{A})]^{2}} \end{equation}\]where \(\boldsymbol{A}\) is the hat matrix of the associated regression problem (see e.g., Section 5.4.1 of The Elements of Statistical Learning by Hastie, Tibshirani and Friedman for further explanation).
fit_smooth_spline_CV <- with(Prestige, smooth.spline(income,prestige,cv = TRUE))
## Warning in smooth.spline(income, prestige, cv = TRUE): cross-validation with
## non-unique 'x' values seems doubtful
fit_smooth_spline_GCV <- with(Prestige, smooth.spline(income,prestige,cv = FALSE))
with(Prestige, plot(income ,prestige, cex =.5, col =" darkgrey "))
lines(fit_smooth_spline,col="red",lwd=2,lty=1)
lines(fit_smooth_spline_GCV,col="blue",lwd=2, lty=2)
legend(20000, 30, legend=c("CV", "GCV"),
col=c("red", "blue"), lty=1:2, cex=0.8)
Likewise for the previous methods, it is very easy to produce forecasts
predict(fit_smooth_spline_GCV,22000)
## $x
## [1] 22000
##
## $y
## [1] 80.55615